{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 为了实现决策树，需建立树结点的类\n",
    "# 定义树结点\n",
    "class Node:\n",
    "    def __init__(self, theta, index, value=None):\n",
    "        self.theta = theta     # 划分的阈值\n",
    "        self.index = index     # 选用的维度\n",
    "        self.value = value     # 根节点的值\n",
    "        self.leftNode = None\n",
    "        self.rightNode = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 定义Gini系数---作为每个子集“好坏”的衡量标准\n",
    "def gini(Y):\n",
    "    l = Y.shape[0]\n",
    "    if l == 0:\n",
    "        return 1\n",
    "    return 1-(np.sum(Y==1)/l)**2-(np.sum(Y==-1)/l)**2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 为了便于实现，找出每一维度下的最佳划分阈值和对应的branch值 --- 但这样实现代价是运行速度\n",
    "# 单维情况下的最佳树桩---大于等于为1类\n",
    "def one_stump(X, Y, thres):\n",
    "    l = thres.shape[0]\n",
    "    mini = Y.shape[0]\n",
    "    for i in range(l):\n",
    "        Y1 = Y[X<thres[i]]\n",
    "        Y2 = Y[X>=thres[i]]\n",
    "        judge = Y1.shape[0]*gini(Y1)+Y2.shape[0]*gini(Y2)\n",
    "        if mini>judge:\n",
    "            mini = judge; b = thres[i]\n",
    "    return mini, b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 找出最佳划分的阈值和对应的维度\n",
    "# 结合全部维数的决策树桩\n",
    "def decision_stump(X, Y):\n",
    "    row, col = X.shape\n",
    "    Xsort = np.sort(X, 0)\n",
    "    thres = (np.r_[Xsort[0:1, :]-0.1, Xsort]+np.r_[Xsort, Xsort[-1:, :]+0.1])/2\n",
    "    mpurity = row; mb = 0; index = 0\n",
    "    for i in range(col):\n",
    "        purity, b = one_stump(X[:, i], Y[:, 0], thres[:, i])\n",
    "        if mpurity > purity:\n",
    "            mpurity = purity; mb = b; index = i\n",
    "    return mb, index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 定义划分终止的条件\n",
    "# 终止条件\n",
    "def stop_cond(X, Y):\n",
    "    if np.sum(Y!=Y[0])==0 or X.shape[0]==1 or np.sum(X!=X[0, :])==0:\n",
    "        return True\n",
    "    return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 定义完全生长的决策树\n",
    "def dTree(X, Y):\n",
    "    if stop_cond(X, Y):\n",
    "        node = Node(None, None, Y[0])\n",
    "        return node\n",
    "    b, index = decision_stump(X, Y)\n",
    "    pos1 = X[:, index] < b; pos2 = X[:, index] >= b\n",
    "    leftX = X[pos1, :]; leftY = Y[pos1, 0:1]\n",
    "    rightX = X[pos2, :]; rightY = Y[pos2, 0:1]\n",
    "    node = Node(b, index)\n",
    "    node.leftNode = dTree(leftX, leftY)\n",
    "    node.rightNode = dTree(rightX, rightY)\n",
    "    return node"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 定义只进行一次划分的决策树（夸张的剪枝）\n",
    "def dTree_one(X, Y):\n",
    "    b, index = decision_stump(X, Y)\n",
    "    pos1 = X[:, index] < b; pos2 = X[:, index] >= b\n",
    "    node = Node(b, index)\n",
    "    value1 = 1 if np.sign(np.sum(Y[pos1]))>=0 else -1\n",
    "    value2 = 1 if np.sign(np.sum(Y[pos2]))>=0 else -1\n",
    "    node.leftNode = Node(None, None, np.array([value1]))\n",
    "    node.rightNode = Node(None, None, np.array([value2]))\n",
    "    return node"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 预测函数---基于决策树对单个样本进行的预测\n",
    "def predict_one(node, X):\n",
    "    if node.value is not None:\n",
    "        return node.value[0]\n",
    "    thre = node.theta; index = node.index\n",
    "    if X[index] < thre:\n",
    "        return predict_one(node.leftNode, X)\n",
    "    else:\n",
    "        return predict_one(node.rightNode, X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 基于决策树的预测结果及其错误率衡量函数\n",
    "def err_fun(X, Y, node):\n",
    "    row, col = X.shape\n",
    "    Yhat = np.zeros(Y.shape)\n",
    "    for i in range(row):\n",
    "        Yhat[i] = predict_one(node, X[i, :])\n",
    "    return Yhat, np.sum(Yhat!=Y)/row"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# bagging函数\n",
    "def bagging(X, Y):\n",
    "    row, col = X.shape\n",
    "    pos = np.random.randint(0, row, (row,))\n",
    "    return X[pos, :], Y[pos, :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 随机森林算法---没有加入feature的随机选择\n",
    "def random_forest(X, Y, T):\n",
    "    nodeArr = []\n",
    "    for i in range(T):\n",
    "        Xtemp, Ytemp = bagging(X, Y)\n",
    "        node = dTree(Xtemp, Ytemp)\n",
    "        nodeArr.append(node)\n",
    "    return nodeArr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 基于剪枝后的随机森林算法\n",
    "def random_forest_pruned(X, Y, T):\n",
    "    nodeArr = []\n",
    "    for i in range(T):\n",
    "        Xtemp, Ytemp = bagging(X, Y)\n",
    "        node = dTree_one(Xtemp, Ytemp)\n",
    "        nodeArr.append(node)\n",
    "    return nodeArr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# ----------------具体题目-------------------\n",
    "# 加载数据函数\n",
    "def loadData(filename):\n",
    "    data = pd.read_csv(filename, sep='\\s+', header=None)\n",
    "    data = data.as_matrix()\n",
    "    col, row = data.shape\n",
    "    X = data[:, 0: row-1]\n",
    "    Y = data[:, row-1:row]\n",
    "    return X, Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 导入数据\n",
    "X, Y = loadData('hw3_train.dat')\n",
    "Xtest, Ytest = loadData('hw3_test.dat')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "完全生长的决策树内部结点数目： 10\n"
     ]
    }
   ],
   "source": [
    "# Q13\n",
    "# 定义一个搜索树有多少结点的函数---叶子结点不计入\n",
    "def internal_node(node):\n",
    "    if node == None:\n",
    "        return 0\n",
    "    if node.leftNode == None and node.rightNode == None:\n",
    "        return 0\n",
    "    l = 0; r = 0\n",
    "    if node.leftNode != None:\n",
    "        l = internal_node(node.leftNode)\n",
    "    if node.rightNode != None:\n",
    "        r = internal_node(node.rightNode)\n",
    "    return 1 + l + r\n",
    "\n",
    "node = dTree(X, Y)\n",
    "print('完全生长的决策树内部结点数目：', internal_node(node))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ein:  0.0 \n",
      "Eout:  0.126\n"
     ]
    }
   ],
   "source": [
    "# Q14 and Q15\n",
    "_, ein = err_fun(X, Y, node)\n",
    "_, eout = err_fun(Xtest, Ytest, node)\n",
    "print('Ein: ', ein, '\\nEout: ', eout)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ein(gt)的平均： 0.0518873333333\n",
      "Ein(G):  0.0\n",
      "Eout(G):  0.07452\n"
     ]
    }
   ],
   "source": [
    "# Q16,Q17,Q18\n",
    "ein = 0; eout = 0; err = 0\n",
    "for j in range(50):\n",
    "    nodeArr = random_forest(X, Y, 300)\n",
    "    l = len(nodeArr)\n",
    "    yhat1 = np.zeros((Y.shape[0], l))\n",
    "    yhat2 = np.zeros((Ytest.shape[0], l))\n",
    "    for i in range(l):\n",
    "        yhat1[:, i:i+1], _ = err_fun(X, Y, nodeArr[i])\n",
    "        yhat2[:, i:i+1], _ = err_fun(Xtest, Ytest, nodeArr[i])\n",
    "    errg = np.sum(yhat1!=Y, 0)/Y.shape[0]\n",
    "    Yhat = np.sign(np.sum(yhat1, 1)).reshape(Y.shape)\n",
    "    Ytesthat = np.sign(np.sum(yhat2, 1)).reshape(Ytest.shape)\n",
    "    Yhat[Yhat == 0] = 1; Ytesthat[Ytesthat == 0] = 1\n",
    "    ein += np.sum(Yhat!=Y)/Y.shape[0]\n",
    "    eout += np.sum(Ytesthat!=Ytest)/Ytest.shape[0]\n",
    "    err += np.sum(errg)/l\n",
    "print('Ein(gt)的平均：', err/50)\n",
    "print('Ein(G): ', ein/50)\n",
    "print('Eout(G): ', eout/50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ein:  0.1106\n",
      "Eout:  0.15336\n"
     ]
    }
   ],
   "source": [
    "# Q19, Q20\n",
    "ein = 0; eout = 0\n",
    "for j in range(50):\n",
    "    nodeArr = random_forest_pruned(X, Y, 300)\n",
    "    l = len(nodeArr)\n",
    "    yhat1 = np.zeros((Y.shape[0], l))\n",
    "    yhat2 = np.zeros((Ytest.shape[0], l))\n",
    "    for i in range(l):\n",
    "        yhat1[:, i:i + 1], _ = err_fun(X, Y, nodeArr[i])\n",
    "        yhat2[:, i:i + 1], _ = err_fun(Xtest, Ytest, nodeArr[i])\n",
    "    Yhat = np.sign(np.sum(yhat1, 1)).reshape(Y.shape)\n",
    "    Ytesthat = np.sign(np.sum(yhat2, 1)).reshape(Ytest.shape)\n",
    "    Yhat[Yhat == 0] = 1;\n",
    "    Ytesthat[Ytesthat == 0] = 1\n",
    "    ein += np.sum(Yhat != Y) / Y.shape[0]\n",
    "    eout += np.sum(Ytesthat != Ytest) / Ytest.shape[0]\n",
    "print('Ein: ', ein/50)\n",
    "print('Eout: ', eout/50)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
